12_python爬虫——下载个人CSDN博客内容

最新推荐文章于 2024-05-02 21:47:38 发布

枫奇

最新推荐文章于 2024-05-02 21:47:38 发布

阅读量5.1k

点赞数 1

分类专栏： python爬虫文章标签： python CSDN博客爬虫

本文链接：https://blog.csdn.net/qiqiyingse/article/details/67641261

版权

python爬虫专栏收录该内容

32 篇文章 1 订阅

订阅专栏

下载个人博客内容

可以是主页的内容，也可以是每个分类下的内容

只需要把传入的URL地址修改一下就OK了

但是修改传入的URL时，记得检查一下如果传入的URL 不带’?viewmode=contents‘，那么只能得到五篇内容

#coding:utf-8
import webbrowser as web
import os
import time
import random
import urllib2,sys
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf-8')

print '''
	本文下载CSDN个人博客下的内容
'''


#此地址根据实际情况修改，但是，记得最后要有 '?viewmode=contents' 否则每页显示内容有限
url = 'http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'
baseurl='http://blog.csdn.net'


def getPage(url): #伪装成浏览器登陆,获取网页源代码
	
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}  
	req = urllib2.Request(url=url,headers=headers)
	try:
		html = urllib2.urlopen(req).read()
	except urllib2.HTTPError,e:
		print e.code
		print e.reason
        #将网页内容传给BeautifulSoup解析
	return html


def geturl(html,url):
	urlList=[]
	#print html
	page = BeautifulSoup(html,'lxml')
	items = page.find_all('div',class_ ='list_item list_view')#找到每一个文章item
	'''
	if 'categor' in url:
		items = page.find_all('div',class_ ='list_item article_item')
	else:
		items = page.find_all('div',class_ ='list_item list_view')
	'''
	print len(items)
	for item in items:
		content=item.find('a')
		url=content.get('href')#找到每一个文章的连接
		url=baseurl+url#拼接成可访问的地址
		urlList.append(url)
	return urlList

def getContent(html):
	page = BeautifulSoup(html,'lxml')
	
	try:
		title=page.find('div',class_='article_title').find('a').text
		title=title.strip()
		print title
	except e:
		print e
	
	try:
		content=page.find('div',class_='article_content')
		#print content.text
		with open(title+'.txt','w') as f:
			f.write(content.text)
	except e:
		print e




html=getPage(url)
urls=geturl(html,url)

count=0
while count<len(urls):#根据文章列表的url决定循环次数，
	print (urls[count])
	htmltest=getPage(urls[count])
	getContent(htmltest)
	count=count+1

重新追加一版

这个版本修正了一些错误

#coding:utf-8  
import urllib2,re,time,random,os,datetime
import HTMLParser  
from bs4 import BeautifulSoup  
import sys  
reload(sys)    
sys.setdefaultencoding('utf-8')  
  
 
  
#自定义打印函数  
def self_log(msg):  
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)  
  
#获取页面内容  
def  get_html(url):  
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}  
    req = urllib2.Request(url=url,headers=headers)  
    try:  
        html = urllib2.urlopen(req).read()
	html=HTMLParser.HTMLParser().unescape(html)
        return html  		
    except urllib2.HTTPError,e:  
        print e.code  

      
#得到博客页面总数  
def get_last_page(html,fd):  
    if not html:  
        self_log(u'页面错误，停止运行')   
        return  
    page = BeautifulSoup(html,'lxml')  
    try:
        last_page=page.find('div',class_ ='pagelist').find_all('a')  
        last_page= last_page[len(last_page)-1].get('href')[-1:]  
        self_log('总共有%s 页博客' % last_page)  
        fd.write('总共有%s 页博客\n' % last_page)
        return last_page  
    except Exception,e:  
        return 1  

      
#获取页面列表  
def get_items(url):  
    content_html=get_html(url)  
    page = BeautifulSoup(content_html,'lxml')  
    items = page.find_all('div',class_ ='list_item list_view')  
    return items  
  
#根据每一个items list 提取需要的元素  
def handle_items(items,content_list,read_num_for_sort):  
    for item in items:  
        temp={}#临时变量  
          
        title=item.find('a')#标题  
        content_url='http://blog.csdn.net'+title.get('href')#标题对应文章的地址  
        read_times=item.find('span',class_ ='link_view').text.strip()#阅读次数  
        comments_time=item.find('span',class_ ='link_comments')#评论次数  
          
        read_number = int(filter(str.isdigit, str(read_times))) #提取出来具体阅读次数的数字，为之后的排序做准备  
        read_num_for_sort.append(read_number)  
  
        #将数据打包  
        temp['indexs']=read_number  
        temp['title']=title.text.strip()  
        temp['read_times']=read_times  
        temp['comments_time']=comments_time.text.strip()  
        temp['content_url']=content_url  
        content_list.append(temp)  
  
#创建文件夹  
def mkdir_folder(path):  
    if not os.path.exists(path):    
        os.makedirs(path)   
  
#获取页面信息  
def getContent(html,dir_path):  
    page = BeautifulSoup(html,'lxml')  
    try:  
        title=page.find('div',class_='article_title').find('a').text  
        title=title.strip()  
    except Exception,e:  
        print e  
    try:  
        content=page.find('div',class_='article_content')  
        dir_path=dir_path  
        artitle_name_path=dir_path+'/'+title+'.txt'  
        with open(artitle_name_path+'.txt','w') as f:  
            f.write(content.text)  
        self_log(u'存贮文章：%s 完毕' % title)  
    except Exception,e:  
        print e  
  
#存贮每一篇文章到本地  
def run_to_get_article(content_total_list,dir_path):  
    self_log('start save every article  ')  
    for article_content in content_total_list:  
        article_url=article_content.split('|')[4]  
        self_log( '将要存贮的地址是： %s ...' % article_url)  
        artitle_html=get_html(article_url)  
        getContent(artitle_html,dir_path)  
    
#根据传进来的地址，获取博主名字，同时以博主名字命名存贮目录	
def get_blocker_name(url):
    if 'viewmode' in url:
        print url.split('.net')[1]
        print url.split('.net')[1].split('?')[0].split('/')[1]
        return url.split('.net')[1].split('?')[0].split('/')[1]
    else:
        print url.split('.net')[1]
        print url.split('.net')[1].split('/')[1]          
        return url.split('.net')[1].split('/')[1]          
#程序运行主函数          
def run(url,dir_path):  
    read_num_for_sort=[]  
    content_list=[]  
    content_total_list=[]  
      
    #定义文件夹名字并创建文件夹  
    dir_path=dir_path
    mkdir_folder(dir_path)  
      
    #定义文件名字  
    count_file_name=dir_path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'  
    fd=open(count_file_name,'w')  
      
    #1.从主页进入获取页面总数  
    main_html=get_html(url)  
    last_page=get_last_page(main_html,fd)  
      

    if  last_page>1:
    #3.组装url，分别加载每页的页面,同时在每一个页面提取我们需要的内容  
        for i in range(1,int(last_page)+1):  
            if  'category' not in url:
                main_url=url.split('?')[0]+'/article/list/%d?viewmode=contents' % i 
            else:
                main_url=url+'/%s' % i
            self_log('即将获取第%d页的内容，地址是：%s' % (i,main_url))  
                  
            items=get_items(main_url)#获取每一页的页面内容，根据页面内容得到文章item list  
            handle_items(items,content_list,read_num_for_sort)#处理item list  
    else:
        items=get_items(url)#获取每一页的页面内容，根据页面内容得到文章item list  
        handle_items(items,content_list,read_num_for_sort)#处理item list  
    self_log('总共有%d 篇文章' % len(content_list))#根据得到的数据，统计文章总数  
    #根据 indexs（阅读次数）这个索引值进行排序  
    #非常好的一个根据列表中字典数据进行排序的方法  
    content_list = sorted(content_list,cmp=lambda x,y:cmp(x['indexs'],y['indexs']),reverse=0)  
      
    article_index = 1  
    for a in content_list:  
        #组装打印语句  
        totalcontent= '第'+str(article_index)+'篇|'+a['title']+'|'+a['read_times']+'|'+a['comments_time']+'|'+a['content_url']  
        #self_log(totalcontent)  
        print totalcontent  
        #将其存贮到本地  
        fd.write(totalcontent)  
        fd.write('\n')  
        article_index +=1  
        content_total_list.append(totalcontent)  
    fd.close()        
  
    return content_total_list  
      
if __name__ == '__main__':   
    print '''
            *****************************************  
            **    Welcome to Spider of Count CSDN  **  
            **      Created on 2017-05-07          **  
            **      @author: Jimy_Fengqi           **  
            *****************************************
			'''

    url='http://blog.csdn.net/qiqiyingse?viewmode=contents'  
    #url='http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents' 
    #url='http://blog.csdn.net/zuoxiaolong8810/article/category/1434962?viewmode=contents' 
    dir_path=get_blocker_name(url)
    content_total_list=run(url,dir_path)  
    run_to_get_article(content_total_list,dir_path)